import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
from scipy import stats
import plotly.express as px
import folium
# from keras.models import Sequential
# from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
# Load the hourly Seoul air-quality measurements and the pollutant metadata
# (item codes, units, quality-threshold bands per pollutant).
data = pd.read_csv('Measurement_summary.csv')
data_info = pd.read_csv('Measurement_item_info.csv')
data.head()
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/1/2017 0:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 0.004 | 0.059 | 0.002 | 1.2 | 73 | 57 |
| 1 | 1/1/2017 1:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 0.004 | 0.058 | 0.002 | 1.2 | 71 | 59 |
| 2 | 1/1/2017 2:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 0.004 | 0.056 | 0.002 | 1.2 | 70 | 59 |
| 3 | 1/1/2017 3:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 0.004 | 0.056 | 0.002 | 1.2 | 70 | 58 |
| 4 | 1/1/2017 4:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 0.003 | 0.051 | 0.002 | 1.2 | 69 | 61 |
data_info.head()
| Item code | Item name | Unit of measurement | Good(Blue) | Normal(Green) | Bad(Yellow) | Very bad(Red) | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | SO2 | ppm | 0.02 | 0.05 | 0.15 | 1.0 |
| 1 | 3 | NO2 | ppm | 0.03 | 0.06 | 0.20 | 2.0 |
| 2 | 5 | CO | ppm | 2.00 | 9.00 | 15.00 | 50.0 |
| 3 | 6 | O3 | ppm | 0.03 | 0.09 | 0.15 | 0.5 |
| 4 | 8 | PM10 | Mircrogram/m3 | 30.00 | 80.00 | 150.00 | 600.0 |
# Create dictionary from the provided information
# Quality bands per pollutant as half-open [lower, upper) intervals,
# transcribed from Measurement_item_info.csv.
# NOTE(review): the PM2.5 cutoffs do not appear in the info preview above —
# confirm 15/35/75/500 against the source file.
conditions = {
    'SO2': {'Good': (0, 0.02), 'Normal': (0.02, 0.05), 'Bad': (0.05, 0.15), 'Very Bad': (0.15, 1)},
    'NO2': {'Good': (0, 0.03), 'Normal': (0.03, 0.06), 'Bad': (0.06, 0.2), 'Very Bad': (0.2, 2)},
    'CO': {'Good': (0, 2), 'Normal': (2, 9), 'Bad': (9, 15), 'Very Bad': (15, 50)},
    'O3': {'Good': (0, 0.03), 'Normal': (0.03, 0.09), 'Bad': (0.09, 0.15), 'Very Bad': (0.15, 0.5)},
    'PM10': {'Good': (0, 30), 'Normal': (30, 80), 'Bad': (80, 150), 'Very Bad': (150, 600)},
    'PM2.5': {'Good': (0, 15), 'Normal': (15, 35), 'Bad': (35, 75), 'Very Bad': (75, 500)},
}
# Function to categorize pollutant levels
def categorize(level, condition):
    """Map a pollutant reading to its quality label.

    Each band in *condition* is a half-open [lower, upper) interval.
    Anything that falls outside the Good/Normal/Bad bands — including
    negative sentinel readings and values at or above the 'Bad' upper
    bound — falls through to 'Very Bad', exactly like the original
    if/elif chain.
    """
    for label in ('Good', 'Normal', 'Bad'):
        lower, upper = condition[label]
        if lower <= level < upper:
            return label
    return 'Very Bad'
# Derive a quality-label column for every pollutant from its threshold bands
for pollutant, bands in conditions.items():
    data[f'{pollutant}_Category'] = data[pollutant].apply(categorize, args=(bands,))
data.sample(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | PM2.5 | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 32391 | 9/28/2017 6:00 | 102 | Jung-gu | 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... | 37.564263 | 126.974676 | 0.003 | 0.027 | 0.005 | 0.4 | 11 | 7 | Good | Good | Good | Good | Good | Good |
| 503046 | 4/2/2018 8:00 | 120 | Dongjak-gu | 6, Sadang-ro 16a-gil, Dongjak-gu, Seoul, Repub... | 37.480917 | 126.971481 | 0.004 | 0.031 | 0.029 | 0.4 | 59 | 27 | Good | Normal | Good | Good | Normal | Normal |
| 191609 | 3/6/2018 17:00 | 108 | Gwangjin-gu | 571, Gwangnaru-ro, Gwangjin-gu, Seoul, Republi... | 37.547180 | 127.092493 | 0.005 | 0.030 | 0.028 | 0.5 | 37 | 19 | Good | Normal | Good | Good | Normal | Normal |
| 158929 | 5/27/2017 20:00 | 107 | Seongdong-gu | 18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re... | 37.541864 | 127.049659 | 0.004 | 0.041 | 0.028 | 0.2 | 91 | 21 | Good | Normal | Good | Good | Bad | Normal |
| 706 | 1/30/2017 10:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 0.004 | 0.010 | 0.033 | 0.3 | 35 | 30 | Good | Good | Good | Normal | Normal | Normal |
# data.to_csv("Measurement_summary_fix.csv")
# Encode the quality labels as ordinals (1 = best ... 4 = worst)
category_to_numeric = {'Good': 1, 'Normal': 2, 'Bad': 3, 'Very Bad': 4}
# Changing categories to be numeric for each category column
for name in ('SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5'):
    column = f'{name}_Category'
    data[column] = data[column].map(category_to_numeric)
data.sample(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | PM2.5 | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 361989 | 12/6/2019 19:00 | 114 | Nowon-gu | 17, Sanggye-ro 23-gil, Nowon-gu, Seoul, Republ... | 37.658774 | 127.068505 | 0.006 | 0.045 | 0.004 | 0.8 | 26 | 20 | 1 | 2 | 1 | 1 | 1 | 2 |
| 540517 | 8/9/2019 20:00 | 121 | Gwanak-gu | 14, Sillimdong-gil, Gwanak-gu, Seoul, Republic... | 37.487355 | 126.927102 | 0.004 | 0.026 | 0.010 | 0.2 | 19 | 10 | 1 | 1 | 1 | 1 | 1 | 1 |
| 620338 | 11/8/2019 15:00 | 124 | Songpa-gu | 236, Baekjegobun-ro, Songpa-gu, Seoul, Republi... | 37.502686 | 127.092509 | 0.002 | 0.034 | 0.021 | 0.4 | 29 | 12 | 1 | 2 | 1 | 1 | 1 | 1 |
| 415312 | 2/7/2017 20:00 | 117 | Guro-gu | 45, Gamasan-ro 27-gil, Guro-gu, Seoul, Republi... | 37.498498 | 126.889692 | 0.007 | 0.037 | 0.003 | 0.6 | 54 | 21 | 1 | 2 | 1 | 1 | 2 | 2 |
| 13846 | 7/31/2018 22:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 0.003 | 0.017 | 0.037 | 0.4 | 27 | 18 | 1 | 1 | 1 | 2 | 1 | 2 |
# Convert 'Measurement date' to datetime. The file uses US-style
# month/day timestamps ('1/1/2017 0:00'), so pin the format explicitly
# instead of relying on per-element inference, which is slower and
# day-first/month-first ambiguous.
data['Measurement date'] = pd.to_datetime(data['Measurement date'], format='%m/%d/%Y %H:%M')
# Check the data types to confirm the conversion
data.dtypes
Measurement date datetime64[ns] Station code int64 Station name(district) object Address object Latitude float64 Longitude float64 SO2 float64 NO2 float64 O3 float64 CO float64 PM10 int64 PM2.5 int64 SO2_Category int64 NO2_Category int64 CO_Category int64 O3_Category int64 PM10_Category int64 PM2.5_Category int64 dtype: object
data.sample(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | PM2.5 | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 180803 | 2019-12-11 11:00:00 | 107 | Seongdong-gu | 18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re... | 37.541864 | 127.049659 | 0.005 | 0.023 | 0.032 | 0.7 | 112 | 66 | 1 | 1 | 1 | 2 | 3 | 3 |
| 567014 | 2019-09-03 16:00:00 | 122 | Seocho-gu | 16, Sinbanpo-ro 15-gil, Seocho-gu, Seoul, Repu... | 37.504547 | 126.994458 | 0.002 | 0.018 | 0.037 | 0.2 | 12 | 10 | 1 | 1 | 1 | 2 | 1 | 1 |
| 510140 | 2019-01-22 23:00:00 | 120 | Dongjak-gu | 6, Sadang-ro 16a-gil, Dongjak-gu, Seoul, Repub... | 37.480917 | 126.971481 | 0.005 | 0.041 | 0.017 | 0.7 | 76 | 50 | 1 | 2 | 1 | 1 | 2 | 3 |
| 413422 | 2019-11-20 14:00:00 | 116 | Gangseo-gu | 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... | 37.544640 | 126.835151 | 0.004 | 0.014 | 0.030 | 0.3 | 16 | 10 | 1 | 1 | 1 | 2 | 1 | 1 |
| 595938 | 2017-01-10 20:00:00 | 124 | Songpa-gu | 236, Baekjegobun-ro, Songpa-gu, Seoul, Republi... | 37.502686 | 127.092509 | 0.005 | 0.017 | 0.025 | 0.2 | 23 | 7 | 1 | 1 | 1 | 1 | 1 | 1 |
# Melt the pollutants and their categories into long format, one frame for
# the raw measurement values and one for the quality labels.
id_columns = ['Measurement date', 'Station code', 'Station name(district)',
              'Address', 'Latitude', 'Longitude']
pollutant_columns = ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']
data_melted = data.melt(id_vars=id_columns,
                        value_vars=pollutant_columns,
                        var_name='Air Pollutants', value_name='Measurement Value')
data_melted_categories = data.melt(id_vars=id_columns,
                                   value_vars=[f'{p}_Category' for p in pollutant_columns],
                                   var_name='Air Pollutants', value_name='Quality')
data_melted.sample(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | Air Pollutants | Measurement Value | |
|---|---|---|---|---|---|---|---|---|
| 45697 | 2019-04-14 23:00:00 | 102 | Jung-gu | 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... | 37.564263 | 126.974676 | SO2 | 0.002 |
| 3749308 | 2019-04-09 10:00:00 | 120 | Dongjak-gu | 6, Sadang-ro 16a-gil, Dongjak-gu, Seoul, Repub... | 37.480917 | 126.971481 | PM2.5 | 11.000 |
| 1424687 | 2017-01-07 21:00:00 | 106 | Mapo-gu | 10, Poeun-ro 6-gil, Mapo-gu, Seoul, Republic o... | 37.555580 | 126.905597 | CO | 1.100 |
| 2899056 | 2019-10-18 03:00:00 | 112 | Gangbuk-gu | 49, Samyang-ro 139-gil, Gangbuk-gu, Seoul, Rep... | 37.647930 | 127.011952 | PM10 | 18.000 |
| 2383615 | 2017-02-02 18:00:00 | 118 | Geumcheon-gu | 20, Geumha-ro 21-gil, Geumcheon-gu, Seoul, Rep... | 37.452357 | 126.908296 | O3 | 0.010 |
data_melted_categories.head(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | Air Pollutants | Quality | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 00:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2_Category | 1 |
| 1 | 2017-01-01 01:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2_Category | 1 |
| 2 | 2017-01-01 02:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2_Category | 1 |
| 3 | 2017-01-01 03:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2_Category | 1 |
| 4 | 2017-01-01 04:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2_Category | 1 |
# Remove the "_Category" suffix so pollutant names line up across both frames
data_melted_categories['Air Pollutants'] = data_melted_categories['Air Pollutants'].str.replace('_Category', '')
# Join each measurement value with its quality label on the shared keys
data_merged = data_melted.merge(
    data_melted_categories,
    on=['Measurement date', 'Station code', 'Station name(district)',
        'Address', 'Latitude', 'Longitude', 'Air Pollutants'],
)
data_merged.head(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | Air Pollutants | Measurement Value | Quality | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 00:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2 | 0.004 | 1 |
| 1 | 2017-01-01 01:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2 | 0.004 | 1 |
| 2 | 2017-01-01 02:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2 | 0.004 | 1 |
| 3 | 2017-01-01 03:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2 | 0.004 | 1 |
| 4 | 2017-01-01 04:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | SO2 | 0.003 | 1 |
# data.to_csv("Measurement_summary_fix_pisan.csv")
# Descriptive statistics; note the -1 minimums on the pollutant columns —
# negative values are missing-data sentinels, handled further below.
summary = data.describe()
summary
| Station code | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | PM2.5 | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 | 647511.000000 |
| mean | 113.000221 | 37.553484 | 126.989340 | -0.001795 | 0.022519 | 0.017979 | 0.509197 | 43.708051 | 25.411995 | 1.020259 | 1.463720 | 1.020129 | 1.355406 | 1.723722 | 1.894979 |
| std | 7.211315 | 0.053273 | 0.078790 | 0.078832 | 0.115153 | 0.099308 | 0.405319 | 71.137342 | 43.924595 | 0.241154 | 0.615827 | 0.240415 | 0.530487 | 0.681677 | 0.800266 |
| min | 101.000000 | 37.452357 | 126.835151 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 107.000000 | 37.517528 | 126.927102 | 0.003000 | 0.016000 | 0.008000 | 0.300000 | 22.000000 | 11.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 50% | 113.000000 | 37.544962 | 127.004850 | 0.004000 | 0.025000 | 0.021000 | 0.500000 | 35.000000 | 19.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 |
| 75% | 119.000000 | 37.584848 | 127.047470 | 0.005000 | 0.038000 | 0.034000 | 0.600000 | 53.000000 | 31.000000 | 1.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 | 2.000000 |
| max | 125.000000 | 37.658774 | 127.136792 | 3.736000 | 38.445000 | 33.600000 | 71.700000 | 3586.000000 | 6256.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 |
# Count NaN per column (all zero — gaps are encoded as -1, not NaN)
missing_values = data.isnull().sum()
missing_values
Measurement date 0 Station code 0 Station name(district) 0 Address 0 Latitude 0 Longitude 0 SO2 0 NO2 0 O3 0 CO 0 PM10 0 PM2.5 0 SO2_Category 0 NO2_Category 0 CO_Category 0 O3_Category 0 PM10_Category 0 PM2.5_Category 0 dtype: int64
# Raw pollutant columns (this ordering is reused throughout the notebook)
numerical_columns = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']
# Distribution of the raw readings
data[numerical_columns].hist(bins=30, figsize=(10, 10), layout=(3, 2));
# Ordinal quality-label columns (values 1-4)
numerical_columns_category = ['SO2_Category', 'NO2_Category', 'O3_Category', 'CO_Category', 'PM10_Category', 'PM2.5_Category']
data[numerical_columns_category].hist(bins=30, figsize=(10, 10), layout=(3, 2));
# Raw + label columns combined, for the joint correlation matrix below
numerical_columns_merge = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5','SO2_Category', 'NO2_Category', 'O3_Category', 'CO_Category', 'PM10_Category', 'PM2.5_Category']
# Visualize the data using boxplots to identify any outliers
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
for ax, pollutant in zip(axes.flatten(), numerical_columns):
    sns.boxplot(data=data, x=pollutant, ax=ax)
    ax.set_title(f'Boxplot of {pollutant}')
plt.tight_layout()
# Pairwise Pearson correlation between the raw pollutant levels
correlation_matrix = data[numerical_columns].corr()
correlation_matrix
| SO2 | NO2 | O3 | CO | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|
| SO2 | 1.000000 | 0.712422 | 0.805551 | 0.304923 | 0.048573 | 0.047531 |
| NO2 | 0.712422 | 1.000000 | 0.785805 | 0.245746 | 0.055532 | 0.057844 |
| O3 | 0.805551 | 0.785805 | 1.000000 | 0.188998 | 0.038602 | 0.033868 |
| CO | 0.304923 | 0.245746 | 0.188998 | 1.000000 | 0.151166 | 0.182867 |
| PM10 | 0.048573 | 0.055532 | 0.038602 | 0.151166 | 1.000000 | 0.228984 |
| PM2.5 | 0.047531 | 0.057844 | 0.033868 | 0.182867 | 0.228984 | 1.000000 |
# Heatmap of the correlation matrix (raw levels)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants')
plt.show()
# Correlation between the ordinal quality labels
correlation_matrix_category = data[numerical_columns_category].corr()
correlation_matrix_category
| SO2_Category | NO2_Category | O3_Category | CO_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|
| SO2_Category | 1.000000 | 0.298820 | 0.367557 | 0.929548 | 0.240768 | 0.190424 |
| NO2_Category | 0.298820 | 1.000000 | -0.176201 | 0.299551 | 0.418563 | 0.459237 |
| O3_Category | 0.367557 | -0.176201 | 1.000000 | 0.364387 | 0.129657 | 0.083108 |
| CO_Category | 0.929548 | 0.299551 | 0.364387 | 1.000000 | 0.238802 | 0.187704 |
| PM10_Category | 0.240768 | 0.418563 | 0.129657 | 0.238802 | 1.000000 | 0.714912 |
| PM2.5_Category | 0.190424 | 0.459237 | 0.083108 | 0.187704 | 0.714912 | 1.000000 |
# Heatmap of the correlation matrix (quality labels)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_category, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants Category')
plt.show()
# Joint correlation of raw levels and their labels
correlation_matrix_merge = data[numerical_columns_merge].corr()
correlation_matrix_merge
| SO2 | NO2 | O3 | CO | PM10 | PM2.5 | SO2_Category | NO2_Category | O3_Category | CO_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SO2 | 1.000000 | 0.712422 | 0.805551 | 0.304923 | 0.048573 | 0.047531 | -0.956390 | -0.300650 | -0.370886 | -0.918129 | -0.231661 | -0.181064 |
| NO2 | 0.712422 | 1.000000 | 0.785805 | 0.245746 | 0.055532 | 0.057844 | -0.629877 | -0.092411 | -0.300536 | -0.648468 | -0.111381 | -0.066652 |
| O3 | 0.805551 | 0.785805 | 1.000000 | 0.188998 | 0.038602 | 0.033868 | -0.729566 | -0.325459 | -0.166421 | -0.749754 | -0.188002 | -0.153828 |
| CO | 0.304923 | 0.245746 | 0.188998 | 1.000000 | 0.151166 | 0.182867 | -0.172866 | 0.252376 | -0.237336 | -0.173522 | 0.208530 | 0.270182 |
| PM10 | 0.048573 | 0.055532 | 0.038602 | 0.151166 | 1.000000 | 0.228984 | -0.034102 | 0.135400 | -0.001572 | -0.038565 | 0.451615 | 0.296372 |
| PM2.5 | 0.047531 | 0.057844 | 0.033868 | 0.182867 | 0.228984 | 1.000000 | -0.028135 | 0.158530 | -0.012344 | -0.032738 | 0.308582 | 0.460219 |
| SO2_Category | -0.956390 | -0.629877 | -0.729566 | -0.172866 | -0.034102 | -0.028135 | 1.000000 | 0.298820 | 0.367557 | 0.929548 | 0.240768 | 0.190424 |
| NO2_Category | -0.300650 | -0.092411 | -0.325459 | 0.252376 | 0.135400 | 0.158530 | 0.298820 | 1.000000 | -0.176201 | 0.299551 | 0.418563 | 0.459237 |
| O3_Category | -0.370886 | -0.300536 | -0.166421 | -0.237336 | -0.001572 | -0.012344 | 0.367557 | -0.176201 | 1.000000 | 0.364387 | 0.129657 | 0.083108 |
| CO_Category | -0.918129 | -0.648468 | -0.749754 | -0.173522 | -0.038565 | -0.032738 | 0.929548 | 0.299551 | 0.364387 | 1.000000 | 0.238802 | 0.187704 |
| PM10_Category | -0.231661 | -0.111381 | -0.188002 | 0.208530 | 0.451615 | 0.308582 | 0.240768 | 0.418563 | 0.129657 | 0.238802 | 1.000000 | 0.714912 |
| PM2.5_Category | -0.181064 | -0.066652 | -0.153828 | 0.270182 | 0.296372 | 0.460219 | 0.190424 | 0.459237 | 0.083108 | 0.187704 | 0.714912 | 1.000000 |
# Heatmap of the correlation matrix (raw levels + labels combined)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_merge, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants with Category')
plt.show()
# Impute the negative sentinel readings (-1, see describe() above) with the
# mean of the valid (non-negative) values of each pollutant.
pollutant_names = ('SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5')
means = {name: data.loc[data[name] >= 0, name].mean() for name in pollutant_names}
# Blank out the negatives, then fill them (and any NaN) with the pollutant mean.
# NOTE(review): the *_Category columns were derived BEFORE this imputation, so
# sentinel rows still carry the worst label (4) — confirm that is intended.
for name in pollutant_names:
    data[name] = data[name].mask(data[name] < 0).fillna(means[name])
def normalize_data(data):
    """Return *data* with every column min-max scaled into [0, 1]."""
    scaled = MinMaxScaler().fit_transform(data)
    return pd.DataFrame(scaled, columns=data.columns, index=data.index)
def standardize_data(data):
    """Return *data* with every column scaled to zero mean and unit variance."""
    scaled = StandardScaler().fit_transform(data)
    return pd.DataFrame(scaled, columns=data.columns, index=data.index)
# Calendar features extracted from the measurement timestamp
timestamps = data['Measurement date'].dt
data['Year'] = timestamps.year
data['Month'] = timestamps.month
data['Quarter'] = timestamps.quarter
data['Hour'] = timestamps.hour
# Calculate the yearly average level of each pollutant
average_pollutant_per_year = data.groupby('Year')[numerical_columns].mean()
average_pollutant_per_year
| SO2 | NO2 | O3 | CO | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|
| Year | ||||||
| 2017 | 0.004627 | 0.029350 | 0.024844 | 0.520167 | 44.455520 | 24.613483 |
| 2018 | 0.004434 | 0.028471 | 0.023495 | 0.502713 | 41.982476 | 24.262548 |
| 2019 | 0.004042 | 0.027979 | 0.024883 | 0.533763 | 45.581123 | 27.952126 |
# Label each yearly mean with its quality band
for pollutant, bands in conditions.items():
    yearly_means = average_pollutant_per_year[pollutant]
    average_pollutant_per_year[f'{pollutant}_Category'] = yearly_means.apply(categorize, args=(bands,))
average_pollutant_per_year
| SO2 | NO2 | O3 | CO | PM10 | PM2.5 | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | ||||||||||||
| 2017 | 0.004627 | 0.029350 | 0.024844 | 0.520167 | 44.455520 | 24.613483 | Good | Good | Good | Good | Normal | Normal |
| 2018 | 0.004434 | 0.028471 | 0.023495 | 0.502713 | 41.982476 | 24.262548 | Good | Good | Good | Good | Normal | Normal |
| 2019 | 0.004042 | 0.027979 | 0.024883 | 0.533763 | 45.581123 | 27.952126 | Good | Good | Good | Good | Normal | Normal |
# Plot the average pollutant levels per year (all pollutants on one axis;
# PM10/PM2.5 dominate the scale since they are in different units)
plt.figure(figsize=(14, 8))
for pollutant in numerical_columns:
    plt.plot(average_pollutant_per_year.index, average_pollutant_per_year[pollutant], label=pollutant)
plt.xlabel('Year')
plt.ylabel('Average Level')
plt.title('Yearly Average Level of Each Pollutant')
plt.legend()
plt.grid(True)
plt.show()
# Set 'Measurement date' as index (in place — later cells rely on this index
# when grouping by 'Measurement date')
data.set_index('Measurement date', inplace=True)
# Plot the time-series data for each pollutant, one figure per pollutant,
# against the new datetime index
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    data[pollutant].plot(figsize=(10, 5))
    plt.title(f'Time-Series Plot of {pollutant} Levels')
    plt.ylabel('Level')
    plt.show()
# Calculate yearly averages and convert 'Year' to integer
data_yearly = data.groupby('Year')[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']].mean().reset_index()
data_yearly['Year'] = data_yearly['Year'].astype(int)
# Set 'Year' as index again
data_yearly.set_index('Year', inplace=True)
# Normalize and standardize data (column-wise, over the 3 yearly rows)
data_normalized_yearly = normalize_data(data_yearly)
data_standardized_yearly = standardize_data(data_yearly)
# Time-series plot for each pollutant: original vs. normalized vs. standardized
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(data_yearly.index, data_yearly[pollutant], label='Original')
    ax.plot(data_normalized_yearly.index, data_normalized_yearly[pollutant], label='Normalized')
    ax.plot(data_standardized_yearly.index, data_standardized_yearly[pollutant], label='Standardized')
    # NOTE(review): the index is plain ints, so the locator/formatter pair is
    # effectively overridden by the explicit set_xticks/set_xticklabels below.
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%d'))
    ax.set_xticks(data_yearly.index)
    ax.set_xticklabels(data_yearly.index)
    ax.set_xlabel('Year')
    plt.title(f'Yearly Time-Series Plot of {pollutant} Levels')
    plt.ylabel('Average Level')
    plt.legend()
    plt.show()
# Restrict to the most recent year in the data
data_last_year = data[data['Year'] == data['Year'].max()]
# Time-series plot for each pollutant (hourly mean across all stations)
plt.figure(figsize=(18, 12))
for i, pollutant in enumerate(numerical_columns, 1):
    plt.subplot(3, 2, i)
    data_last_year.groupby('Measurement date')[pollutant].mean().plot()
    plt.title(pollutant)
    plt.xlabel('Time')
    plt.ylabel('Average Level')
plt.tight_layout()
plt.show()
data_last_year.sample(5)
| Station code | Station name(district) | Address | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | ... | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | Year | Month | Quarter | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Measurement date | |||||||||||||||||||||
| 2019-07-23 22:00:00 | 110 | Jungnang-gu | 369, Yongmasan-ro, Jungnang-gu, Seoul, Republi... | 37.584848 | 127.094023 | 0.007 | 0.014 | 0.010 | 0.2 | 15.0 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 2019 | 7 | 3 | 22 |
| 2019-05-09 13:00:00 | 103 | Yongsan-gu | 136, Hannam-daero, Yongsan-gu, Seoul, Republic... | 37.540033 | 127.004850 | 0.005 | 0.036 | 0.051 | 0.4 | 42.0 | ... | 1 | 2 | 1 | 2 | 2 | 2 | 2019 | 5 | 2 | 13 |
| 2019-09-10 23:00:00 | 123 | Gangnam-gu | 426, Hakdong-ro, Gangnam-gu, Seoul, Republic o... | 37.517528 | 127.047470 | 0.002 | 0.013 | 0.072 | 0.3 | 33.0 | ... | 1 | 1 | 1 | 2 | 2 | 2 | 2019 | 9 | 3 | 23 |
| 2019-10-13 05:00:00 | 113 | Dobong-gu | 34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub... | 37.654192 | 127.029088 | 0.002 | 0.013 | 0.016 | 0.4 | 17.0 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 2019 | 10 | 4 | 5 |
| 2019-06-25 03:00:00 | 116 | Gangseo-gu | 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... | 37.544640 | 126.835151 | 0.004 | 0.018 | 0.035 | 0.4 | 30.0 | ... | 1 | 1 | 1 | 2 | 2 | 2 | 2019 | 6 | 2 | 3 |
5 rows × 21 columns
# Calculate monthly averages for each year
data_monthly = data.groupby(['Year', 'Month'])[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']].mean().reset_index()
# Normalize and standardize data
# NOTE(review): these two frames are only used by the commented-out plot
# lines below — currently dead weight; keep or re-enable deliberately.
data_normalized_monthly = normalize_data(data_monthly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])
data_standardized_monthly = standardize_data(data_monthly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])
# Time-series plot for each pollutant, one line per year over months 1-12
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))
    for year in data_monthly['Year'].unique():
        monthly_data_for_year = data_monthly[data_monthly['Year'] == year]
        normalized_data_for_year = data_normalized_monthly[data_monthly['Year'] == year]
        standardized_data_for_year = data_standardized_monthly[data_monthly['Year'] == year]
        plt.plot(monthly_data_for_year['Month'], monthly_data_for_year[pollutant], label=f'Original {year}')
        # plt.plot(normalized_data_for_year['Month'], normalized_data_for_year[pollutant], label=f'Normalized {year}')
        # plt.plot(standardized_data_for_year['Month'], standardized_data_for_year[pollutant], label=f'Standardized {year}')
    plt.title(f'Monthly Time-Series Plot of {pollutant} Levels')
    plt.xlabel('Month')
    plt.ylabel('Average Level')
    plt.legend()
    plt.show()
# NOTE(review): Month.max() is always 12, so this selects the Decembers of
# EVERY year, not the last calendar month of the data — confirm intended.
data_last_month = data[data['Month'] == data['Month'].max()]
# Time-series plot for each pollutant (hourly mean across all stations)
plt.figure(figsize=(18, 12))
for i, pollutant in enumerate(numerical_columns, 1):
    plt.subplot(3, 2, i)
    data_last_month.groupby('Measurement date')[pollutant].mean().plot()
    plt.title(pollutant)
    plt.xlabel('Time')
    plt.ylabel('Average Level')
plt.tight_layout()
plt.show()
data_last_month.sample(5)
| Station code | Station name(district) | Address | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | ... | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | Year | Month | Quarter | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Measurement date | |||||||||||||||||||||
| 2017-12-06 09:00:00 | 121 | Gwanak-gu | 14, Sillimdong-gil, Gwanak-gu, Seoul, Republic... | 37.487355 | 126.927102 | 0.006 | 0.040 | 0.005 | 0.6 | 30.0 | ... | 1 | 2 | 1 | 1 | 2 | 2 | 2017 | 12 | 4 | 9 |
| 2018-12-19 21:00:00 | 124 | Songpa-gu | 236, Baekjegobun-ro, Songpa-gu, Seoul, Republi... | 37.502686 | 127.092509 | 0.005 | 0.069 | 0.002 | 1.0 | 68.0 | ... | 1 | 3 | 1 | 1 | 2 | 3 | 2018 | 12 | 4 | 21 |
| 2017-12-23 10:00:00 | 108 | Gwangjin-gu | 571, Gwangnaru-ro, Gwangjin-gu, Seoul, Republi... | 37.547180 | 127.092493 | 0.006 | 0.067 | 0.003 | 1.0 | 102.0 | ... | 1 | 3 | 1 | 1 | 3 | 3 | 2017 | 12 | 4 | 10 |
| 2018-12-14 10:00:00 | 105 | Seodaemun-gu | 32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,... | 37.593742 | 126.949679 | 0.007 | 0.019 | 0.015 | 0.5 | 34.0 | ... | 1 | 1 | 1 | 1 | 2 | 2 | 2018 | 12 | 4 | 10 |
| 2019-12-22 09:00:00 | 106 | Mapo-gu | 10, Poeun-ro 6-gil, Mapo-gu, Seoul, Republic o... | 37.555580 | 126.905597 | 0.004 | 0.041 | 0.003 | 0.9 | 50.0 | ... | 1 | 2 | 1 | 1 | 2 | 3 | 2019 | 12 | 4 | 9 |
5 rows × 21 columns
# Calculate quarterly averages
data_quarterly = data.groupby(['Year','Quarter'])[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']].mean().reset_index()
# Normalize and standardize data
# NOTE(review): only used by the commented-out plot lines below.
data_normalized_quarterly = normalize_data(data_quarterly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])
data_standardized_quarterly = standardize_data(data_quarterly[['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']])
# Time-series plot for each pollutant, one line per year over quarters 1-4
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))
    for year in data_quarterly['Year'].unique():
        quarterly_data_for_year = data_quarterly[data_quarterly['Year'] == year]
        quarterly_normalized_data_for_year = data_normalized_quarterly[data_quarterly['Year'] == year]
        quarterly_standardized_data_for_year = data_standardized_quarterly[data_quarterly['Year'] == year]
        plt.plot(quarterly_data_for_year['Quarter'], quarterly_data_for_year[pollutant], label=f'Original {year}')
        # plt.plot(quarterly_normalized_data_for_year['Quarter'], quarterly_normalized_data_for_year[pollutant], label=f'Normalized {year}')
        # plt.plot(quarterly_standardized_data_for_year['Quarter'], quarterly_standardized_data_for_year[pollutant], label=f'Standardized {year}')
    plt.title(f'Quarterly Time-Series Plot of {pollutant} Levels')
    plt.xlabel('Quarterly')
    plt.ylabel('Average Level')
    plt.legend()
    plt.show()
# NOTE(review): Quarter.max() is always 4, so this selects Q4 of EVERY year.
# Also 'data_last_quaeter' is a typo for 'data_last_quarter' — kept as-is in
# case a later cell references it.
data_last_quaeter = data[data['Quarter'] == data['Quarter'].max()]
# Time-series plot for each pollutant (hourly mean across all stations)
plt.figure(figsize=(18, 12))
for i, pollutant in enumerate(numerical_columns, 1):
    plt.subplot(3, 2, i)
    data_last_quaeter.groupby('Measurement date')[pollutant].mean().plot()
    plt.title(pollutant)
    plt.xlabel('Time')
    plt.ylabel('Average Level')
plt.tight_layout()
plt.show()
data_last_quaeter.sample(5)
| Station code | Station name(district) | Address | Latitude | Longitude | SO2 | NO2 | O3 | CO | PM10 | ... | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | Year | Month | Quarter | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Measurement date | |||||||||||||||||||||
| 2018-11-19 11:00:00 | 109 | Dongdaemun-gu | 43, Cheonho-daero 13-gil, Dongdaemun-gu, Seoul... | 37.575743 | 127.028885 | 0.010000 | 0.071000 | 0.0060 | 1.100000 | 74.000000 | ... | 1 | 3 | 1 | 1 | 2 | 3 | 2018 | 11 | 4 | 11 |
| 2017-12-18 03:00:00 | 116 | Gangseo-gu | 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... | 37.544640 | 126.835151 | 0.004372 | 0.028609 | 0.0244 | 0.518663 | 43.983296 | ... | 4 | 4 | 4 | 4 | 4 | 4 | 2017 | 12 | 4 | 3 |
| 2017-12-13 18:00:00 | 102 | Jung-gu | 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... | 37.564263 | 126.974676 | 0.003000 | 0.020000 | 0.0220 | 0.400000 | 28.000000 | ... | 1 | 1 | 1 | 1 | 1 | 2 | 2017 | 12 | 4 | 18 |
| 2018-12-26 22:00:00 | 116 | Gangseo-gu | 71, Gangseo-ro 45da-gil, Gangseo-gu, Seoul, Re... | 37.544640 | 126.835151 | 0.005000 | 0.015000 | 0.0250 | 0.400000 | 28.000000 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 2018 | 12 | 4 | 22 |
| 2018-11-03 01:00:00 | 112 | Gangbuk-gu | 49, Samyang-ro 139-gil, Gangbuk-gu, Seoul, Rep... | 37.647930 | 127.011952 | 0.002000 | 0.041000 | 0.0060 | 0.600000 | 26.000000 | ... | 1 | 2 | 1 | 1 | 1 | 2 | 2018 | 11 | 4 | 1 |
5 rows × 21 columns
# Calculate the monthly average level of each pollutant
# (month-of-year seasonality, pooled over all three years)
average_pollutant_per_month = data.groupby('Month')[numerical_columns].mean()
# Plotting the monthly average level of each pollutant
fig, axes = plt.subplots(3, 2, figsize=(18, 12))
for ax, pollutant in zip(axes.flatten(), numerical_columns):
    average_pollutant_per_month[pollutant].plot(ax=ax)
    ax.set_title(f'Monthly Average {pollutant} Levels')
    ax.set_xlabel('Month')
    ax.set_ylabel('Average Level')
plt.tight_layout()
plt.show()
average_pollutant_per_month
| SO2 | NO2 | O3 | CO | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|
| Month | ||||||
| 1 | 0.005217 | 0.036096 | 0.013289 | 0.696928 | 59.507841 | 35.195308 |
| 2 | 0.005122 | 0.034629 | 0.018146 | 0.643196 | 52.947707 | 32.479834 |
| 3 | 0.005072 | 0.037306 | 0.027614 | 0.590821 | 61.714911 | 40.929212 |
| 4 | 0.004398 | 0.029975 | 0.031333 | 0.479555 | 50.543532 | 25.562793 |
| 5 | 0.004561 | 0.027057 | 0.038332 | 0.455426 | 54.104655 | 26.589209 |
| 6 | 0.004067 | 0.022784 | 0.038208 | 0.417441 | 38.082114 | 24.410237 |
| 7 | 0.003942 | 0.019639 | 0.028244 | 0.395531 | 30.469540 | 20.858209 |
| 8 | 0.003783 | 0.018679 | 0.028110 | 0.390576 | 27.074553 | 16.273837 |
| 9 | 0.003704 | 0.022365 | 0.025929 | 0.425334 | 28.163988 | 16.137306 |
| 10 | 0.003785 | 0.026266 | 0.019915 | 0.473149 | 29.954536 | 15.960582 |
| 11 | 0.004429 | 0.035189 | 0.012832 | 0.615757 | 47.963381 | 24.770641 |
| 12 | 0.004482 | 0.034329 | 0.011228 | 0.650039 | 49.122613 | 29.238891 |
# Monthly distribution (spread + outliers) of each pollutant
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=data, x='Month', y=pollutant)
    plt.title(f'Boxplot of {pollutant} Levels by Month')
    plt.ylabel('Level')
    plt.show()
# Hour-of-day distribution of each pollutant
for pollutant in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=data, x='Hour', y=pollutant)
    plt.title(f'Boxplot of {pollutant} Levels by Hour of the Day')
    plt.ylabel('Level')
    plt.show()
# Hour was only needed for these plots; remove it again
data = data.drop(columns=["Hour"])
# Location analysis: per-station mean level of each pollutant
average_pollutant_per_station = data.groupby('Station name(district)')[numerical_columns].mean()

# One bar chart per pollutant, stations sorted from highest to lowest mean
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
for axis, col in zip(axes.flatten(), numerical_columns):
    ranked = average_pollutant_per_station[col].sort_values(ascending=False)
    ranked.plot(kind='bar', ax=axis)
    axis.set_title(f'Average {col} Levels by Station')
    axis.set_ylabel('Average Level')
plt.tight_layout()
plt.show()
average_pollutant_per_station
| SO2 | NO2 | O3 | CO | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|
| Station name(district) | ||||||
| Dobong-gu | 0.004249 | 0.022366 | 0.027007 | 0.577858 | 43.476836 | 25.853363 |
| Dongdaemun-gu | 0.005399 | 0.030024 | 0.022892 | 0.520852 | 39.181641 | 23.230170 |
| Dongjak-gu | 0.003642 | 0.030059 | 0.025239 | 0.473793 | 41.794418 | 24.460256 |
| Eunpyeong-gu | 0.004256 | 0.024903 | 0.028471 | 0.553067 | 42.766211 | 25.158484 |
| Gangbuk-gu | 0.003178 | 0.021713 | 0.029482 | 0.463032 | 38.854965 | 21.739436 |
| Gangdong-gu | 0.004519 | 0.029935 | 0.021883 | 0.519003 | 45.361976 | 24.932772 |
| Gangnam-gu | 0.005250 | 0.029288 | 0.019895 | 0.465135 | 40.026230 | 24.307809 |
| Gangseo-gu | 0.005078 | 0.030313 | 0.026210 | 0.485949 | 54.801959 | 23.563652 |
| Geumcheon-gu | 0.003936 | 0.030880 | 0.023672 | 0.492682 | 39.982541 | 24.904506 |
| Guro-gu | 0.005648 | 0.029656 | 0.028276 | 0.411604 | 51.589377 | 30.869586 |
| Gwanak-gu | 0.004749 | 0.031230 | 0.024400 | 0.464334 | 45.973865 | 28.187193 |
| Gwangjin-gu | 0.004315 | 0.027828 | 0.023457 | 0.628364 | 45.839476 | 29.329425 |
| Jongno-gu | 0.004386 | 0.031628 | 0.024625 | 0.585405 | 38.017699 | 22.893540 |
| Jung-gu | 0.003591 | 0.032298 | 0.025522 | 0.504858 | 37.991307 | 22.877427 |
| Jungnang-gu | 0.005739 | 0.026040 | 0.023252 | 0.479176 | 38.209019 | 22.415185 |
| Mapo-gu | 0.003954 | 0.027271 | 0.024730 | 0.528877 | 47.771956 | 30.426119 |
| Nowon-gu | 0.004557 | 0.027053 | 0.024494 | 0.531892 | 40.433423 | 24.257778 |
| Seocho-gu | 0.004200 | 0.028358 | 0.025662 | 0.434477 | 54.040805 | 29.975835 |
| Seodaemun-gu | 0.004274 | 0.024067 | 0.025601 | 0.599368 | 42.530610 | 23.460195 |
| Seongbuk-gu | 0.003630 | 0.031340 | 0.022527 | 0.645122 | 45.934146 | 25.907135 |
| Seongdong-gu | 0.004412 | 0.028818 | 0.021654 | 0.485073 | 49.910950 | 25.529989 |
| Songpa-gu | 0.004032 | 0.029795 | 0.023431 | 0.569492 | 45.943192 | 24.057421 |
| Yangcheon-gu | 0.004266 | 0.031597 | 0.022208 | 0.524122 | 42.700504 | 25.736788 |
| Yeongdeungpo-gu | 0.004534 | 0.028877 | 0.023665 | 0.575928 | 50.562172 | 31.427645 |
| Yongsan-gu | 0.003516 | 0.029877 | 0.021763 | 0.447149 | 35.891700 | 23.877670 |
# Label each station mean with its AQI category using the thresholds in `conditions`
for pol, bounds in conditions.items():
    average_pollutant_per_station[f'{pol}_Category'] = (
        average_pollutant_per_station[pol].apply(lambda v, b=bounds: categorize(v, b))
    )
average_pollutant_per_station
| SO2 | NO2 | O3 | CO | PM10 | PM2.5 | SO2_Category | NO2_Category | CO_Category | O3_Category | PM10_Category | PM2.5_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Station name(district) | ||||||||||||
| Dobong-gu | 0.004249 | 0.022366 | 0.027007 | 0.577858 | 43.476836 | 25.853363 | Good | Good | Good | Good | Normal | Normal |
| Dongdaemun-gu | 0.005399 | 0.030024 | 0.022892 | 0.520852 | 39.181641 | 23.230170 | Good | Normal | Good | Good | Normal | Normal |
| Dongjak-gu | 0.003642 | 0.030059 | 0.025239 | 0.473793 | 41.794418 | 24.460256 | Good | Normal | Good | Good | Normal | Normal |
| Eunpyeong-gu | 0.004256 | 0.024903 | 0.028471 | 0.553067 | 42.766211 | 25.158484 | Good | Good | Good | Good | Normal | Normal |
| Gangbuk-gu | 0.003178 | 0.021713 | 0.029482 | 0.463032 | 38.854965 | 21.739436 | Good | Good | Good | Good | Normal | Normal |
| Gangdong-gu | 0.004519 | 0.029935 | 0.021883 | 0.519003 | 45.361976 | 24.932772 | Good | Good | Good | Good | Normal | Normal |
| Gangnam-gu | 0.005250 | 0.029288 | 0.019895 | 0.465135 | 40.026230 | 24.307809 | Good | Good | Good | Good | Normal | Normal |
| Gangseo-gu | 0.005078 | 0.030313 | 0.026210 | 0.485949 | 54.801959 | 23.563652 | Good | Normal | Good | Good | Normal | Normal |
| Geumcheon-gu | 0.003936 | 0.030880 | 0.023672 | 0.492682 | 39.982541 | 24.904506 | Good | Normal | Good | Good | Normal | Normal |
| Guro-gu | 0.005648 | 0.029656 | 0.028276 | 0.411604 | 51.589377 | 30.869586 | Good | Good | Good | Good | Normal | Normal |
| Gwanak-gu | 0.004749 | 0.031230 | 0.024400 | 0.464334 | 45.973865 | 28.187193 | Good | Normal | Good | Good | Normal | Normal |
| Gwangjin-gu | 0.004315 | 0.027828 | 0.023457 | 0.628364 | 45.839476 | 29.329425 | Good | Good | Good | Good | Normal | Normal |
| Jongno-gu | 0.004386 | 0.031628 | 0.024625 | 0.585405 | 38.017699 | 22.893540 | Good | Normal | Good | Good | Normal | Normal |
| Jung-gu | 0.003591 | 0.032298 | 0.025522 | 0.504858 | 37.991307 | 22.877427 | Good | Normal | Good | Good | Normal | Normal |
| Jungnang-gu | 0.005739 | 0.026040 | 0.023252 | 0.479176 | 38.209019 | 22.415185 | Good | Good | Good | Good | Normal | Normal |
| Mapo-gu | 0.003954 | 0.027271 | 0.024730 | 0.528877 | 47.771956 | 30.426119 | Good | Good | Good | Good | Normal | Normal |
| Nowon-gu | 0.004557 | 0.027053 | 0.024494 | 0.531892 | 40.433423 | 24.257778 | Good | Good | Good | Good | Normal | Normal |
| Seocho-gu | 0.004200 | 0.028358 | 0.025662 | 0.434477 | 54.040805 | 29.975835 | Good | Good | Good | Good | Normal | Normal |
| Seodaemun-gu | 0.004274 | 0.024067 | 0.025601 | 0.599368 | 42.530610 | 23.460195 | Good | Good | Good | Good | Normal | Normal |
| Seongbuk-gu | 0.003630 | 0.031340 | 0.022527 | 0.645122 | 45.934146 | 25.907135 | Good | Normal | Good | Good | Normal | Normal |
| Seongdong-gu | 0.004412 | 0.028818 | 0.021654 | 0.485073 | 49.910950 | 25.529989 | Good | Good | Good | Good | Normal | Normal |
| Songpa-gu | 0.004032 | 0.029795 | 0.023431 | 0.569492 | 45.943192 | 24.057421 | Good | Good | Good | Good | Normal | Normal |
| Yangcheon-gu | 0.004266 | 0.031597 | 0.022208 | 0.524122 | 42.700504 | 25.736788 | Good | Normal | Good | Good | Normal | Normal |
| Yeongdeungpo-gu | 0.004534 | 0.028877 | 0.023665 | 0.575928 | 50.562172 | 31.427645 | Good | Good | Good | Good | Normal | Normal |
| Yongsan-gu | 0.003516 | 0.029877 | 0.021763 | 0.447149 | 35.891700 | 23.877670 | Good | Good | Good | Good | Normal | Normal |
# Function to map category codes to marker colors
def map_color(category):
    """Return the marker color for an air-quality category code.

    Follows the legend in `data_info`: 1 = Good (Blue), 2 = Normal (Green),
    3 = Bad (Yellow), 4 = Very bad (Red). The original version had blue and
    green swapped relative to that legend.

    Any unrecognized code falls back to 'red'.
    """
    colors = {1: 'blue', 2: 'green', 3: 'yellow', 4: 'red'}
    return colors.get(category, 'red')
# Create a map centered at an average latitude and longitude of the stations
# map_folium = folium.Map(location=[data['Latitude'].mean(), data['Longitude'].mean()], zoom_start=10)
# For each station, create a circle marker
# for idx, row in data.iterrows():
# Convert the level of a pollutant to a category
# category = categorize(row['PM2.5'], conditions['PM2.5'])
# Convert the category to a color
# color = map_color(category)
# Adjust the size of the marker based on the level of the pollutant
# size = row['PM2.5'] / 10
# folium.CircleMarker(location=[row['Latitude'], row['Longitude']], radius=size, color=color, fill=True).add_to(map_folium)
# Show the map
# map_folium
# NOTE: this interactive map does not render in Colab
fig = px.scatter_geo(data_merged, lon='Longitude', lat='Latitude', color='Quality')
fig.show()
# NOTE: this interactive map does not render in Colab
# Pairwise scatter plots of pollutants on a 1% sample (seeded, so the
# sample is identical to resampling with random_state=1 inside the loop)
pairs = [('SO2', 'NO2'), ('SO2', 'O3'), ('NO2', 'O3'),
         ('PM10', 'PM2.5'), ('SO2', 'PM2.5'), ('NO2', 'PM10')]
sampled = data.sample(frac=0.01, random_state=1)
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
for axis, (left, right) in zip(axes.flatten(), pairs):
    sns.scatterplot(data=sampled, x=left, y=right, ax=axis)
    axis.set_title(f'{left} vs {right}')
plt.tight_layout()
plt.show()
# Distribution of each pollutant: histogram (left column) and QQ plot (right)
fig, axes = plt.subplots(6, 2, figsize=(15, 20))
for row, col in enumerate(numerical_columns):
    hist_ax, qq_ax = axes[row, 0], axes[row, 1]
    # Histogram with KDE overlay
    sns.histplot(data=data, x=col, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')
    # QQ plot against a normal distribution
    stats.probplot(data[col], plot=qq_ax)
    qq_ax.set_title(f'QQ Plot of {col}')
plt.tight_layout()
plt.show()
# The full dataset is too large to render; pairplot a seeded 10% sample
sample_data = data.sample(frac=0.1, random_state=1)
sns.pairplot(sample_data[numerical_columns])
plt.show()
# data.set_index('Measurement date', inplace=True)
# Resample to daily frequency.
# numeric_only=True pins the current behavior (non-numeric columns dropped)
# and silences the pandas FutureWarning about the changing default.
data_daily = data.resample('D').mean(numeric_only=True)
# Resample to monthly frequency
data_Month = data.resample('M').mean(numeric_only=True)
C:\Users\Fajri\AppData\Local\Temp\ipykernel_14984\3108603987.py:4: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. C:\Users\Fajri\AppData\Local\Temp\ipykernel_14984\3108603987.py:7: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# Prepare sequence data
def create_sequences(data, seq_length):
    """Slice a series into overlapping (window, next-value) training pairs.

    Returns (X, y) where X[i] = data[i : i + seq_length] and
    y[i] = data[i + seq_length].
    """
    n_windows = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(n_windows)]
    targets = [data[start + seq_length] for start in range(n_windows)]
    return np.array(windows), np.array(targets)
# Select the pollutants and convert to numpy array
pollutants = ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']
data_pollutants = data[pollutants].values
# Scale every pollutant into [0, 1]; keep `scaler` so predictions can be
# inverse-transformed back to original units later
scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(data_pollutants)
# Number of past time steps used to predict the next step
seq_length = 5
# Build (window, next-value) pairs for sequence modeling
X, y = create_sequences(data_scaled, seq_length)
# Chronological 80/20 split — shuffle=False preserves time order,
# which matters for time-series evaluation
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=False)
# data = data.drop(columns=["level_0","index"])
data = data.reset_index()
# Shared identifier columns for both long-format frames
_id_vars = ['Measurement date', 'Station code', 'Station name(district)', 'Address',
            'Latitude', 'Longitude', 'Year', 'Month', 'Quarter']
# Long format: one row per (timestamp, station, pollutant) measurement
data_melted = data.melt(id_vars=_id_vars,
                        value_vars=['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5'],
                        var_name='Air Pollutants', value_name='Measurement Value')
# Same shape, but carrying the per-pollutant quality category
data_melted_categories = data.melt(id_vars=_id_vars,
                                   value_vars=['SO2_Category', 'NO2_Category', 'CO_Category',
                                               'O3_Category', 'PM10_Category', 'PM2.5_Category'],
                                   var_name='Air Pollutants', value_name='Quality')
# Strip the "_Category" suffix so both frames key on the same pollutant names
data_melted_categories['Air Pollutants'] = data_melted_categories['Air Pollutants'].str.replace('_Category', '')
# Attach each measurement's quality label
data_merged = pd.merge(data_melted, data_melted_categories,
                       on=_id_vars + ['Air Pollutants'])
data_merged.head(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | Year | Month | Quarter | Air Pollutants | Measurement Value | Quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 00:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 2017 | 1 | 1 | SO2 | 0.004 | 1 |
| 1 | 2017-01-01 01:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 2017 | 1 | 1 | SO2 | 0.004 | 1 |
| 2 | 2017-01-01 02:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 2017 | 1 | 1 | SO2 | 0.004 | 1 |
| 3 | 2017-01-01 03:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 2017 | 1 | 1 | SO2 | 0.004 | 1 |
| 4 | 2017-01-01 04:00:00 | 101 | Jongno-gu | 19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ... | 37.572016 | 127.005008 | 2017 | 1 | 1 | SO2 | 0.003 | 1 |
# Dictionary translating numeric quality codes into readable labels
numeric_to_category = {1 : 'Good', 2 : 'Normal', 3 : 'Bad', 4 : 'Very Bad'}
# Replace the numeric Quality codes with their category names
data_merged['Quality'] = data_merged['Quality'].map(numeric_to_category)
data_merged.sample(5)
| Measurement date | Station code | Station name(district) | Address | Latitude | Longitude | Year | Month | Quarter | Air Pollutants | Measurement Value | Quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 960639 | 2017-04-08 03:00:00 | 113 | Dobong-gu | 34, Sirubong-ro 2-gil, Dobong-gu, Seoul, Repub... | 37.654192 | 127.029088 | 2017 | 4 | 2 | NO2 | 0.037 | Normal |
| 1870534 | 2017-08-27 02:00:00 | 123 | Gangnam-gu | 426, Hakdong-ro, Gangnam-gu, Seoul, Republic o... | 37.517528 | 127.047470 | 2017 | 8 | 3 | CO | 0.400 | Good |
| 627508 | 2017-09-03 21:00:00 | 125 | Gangdong-gu | 59, Gucheonmyeon-ro 42-gil, Gangdong-gu, Seoul... | 37.544962 | 127.136792 | 2017 | 9 | 3 | SO2 | 0.004 | Good |
| 456870 | 2018-11-22 14:00:00 | 118 | Geumcheon-gu | 20, Geumha-ro 21-gil, Geumcheon-gu, Seoul, Rep... | 37.452357 | 126.908296 | 2018 | 11 | 4 | SO2 | 0.004 | Good |
| 49679 | 2019-10-03 17:00:00 | 102 | Jung-gu | 15, Deoksugung-gil, Jung-gu, Seoul, Republic o... | 37.564263 | 126.974676 | 2019 | 10 | 4 | SO2 | 0.003 | Good |
# data_merged.to_csv("Measurement_summary_fix_pisan_bangets.csv",index=False)